本报告为了解决以下几点问题:
本报告对豆瓣【中国内地电影票房总排行】网站进行爬虫,主要采集以下几方面的数据,爬虫代码见代码部分的crawl.R。
RankTitleRatingAppraiseDirectorActorTypeAreaSalesDatePublishRating为十分制,存在一位小数,且1分、2分左右的电影数量少,因此可以将电影划分成低于3分、3-5分、5-6分、6-7分、7-8分、8-9分、高于9分这7个等级,统计得到每个等级对应的平均票房、社交媒体上的平均评论人数,绘制相关柱形图,分析平均票房与评分、平均评论人数与评分的相关关系。Type上榜电影的数量,并统计各类型电影的总票房,绘制相关饼图,分析更受欢迎和更卖座的电影类型。因为提取到的电影类型Type是以诸如“喜剧/爱情/奇幻”等多类型存储,所以对应的该部电影在三个类别中均上榜,但在计算票房时需按照类别数将票房均分。Actor的出现次数,并绘制词云,因出现1次的演员数量过多且意义不大,所以绘制出现次数2次及以上的主演的词云。以下是数据分析和数据挖掘过程
# Import packages
library(recharts)
library(wordcloud2)
library(stringr)
library(plyr)
library(DT)
# Read the data
movies<-read.csv("..\\data\\movies.csv")
# Delete invalid information
movies<-na.omit(movies)
# ---------- Draw the histogram of Sales_avg and Appraise_avg in different ratings ----------
# Divide ratings into seven ranks
rating_1<-subset(movies,Rating<=3)
rating_2<-subset(movies,Rating>3 & Rating<=5)
rating_3<-subset(movies,Rating>5 & Rating<=6)
rating_4<-subset(movies,Rating>6 & Rating<=7)
rating_5<-subset(movies,Rating>7 & Rating<=8)
rating_6<-subset(movies,Rating>8 & Rating<=9)
rating_7<-subset(movies,Rating>9)
# Build a new frame for different ratings about Sales_avg and Appraise_avg
rating_df<-data.frame(Rating=c("0~3","3~5","5~6","6~7","7~8","8~9","9~10"),
Sales_avg=c(mean(rating_1$Sales),mean(rating_2$Sales),
mean(rating_3$Sales),mean(rating_4$Sales),
mean(rating_5$Sales),mean(rating_6$Sales),
mean(rating_7$Sales)),
Appraise_avg=c(mean(rating_1$Appraise),mean(rating_2$Appraise),
mean(rating_3$Appraise),mean(rating_4$Appraise),
mean(rating_5$Appraise),mean(rating_6$Appraise),
mean(rating_7$Appraise)))
# Draw the histogram of Sales_avg in different ratings
Sales_eBar<-eBar(rating_df,~Rating,~Sales_avg)+
eTitle(title="各评分电影平均票房数柱状图")+
eLegend(show=FALSE)
Sales_eBar
# Draw the histogram of Appraise_avg in different ratings
Appraise_eBar<-eBar(rating_df,~Rating,~Appraise_avg)+
eTitle(title="各评分电影平均评论数柱状图")+
eLegend(show=FALSE)
Appraise_eBar
# ---------- Draw the pie chart of types and sales of types ----------
# Extract the type column and divide by /
type<-movies$Type
type<-as.character(type)
type<-strsplit(type,"/")
# Extract the sales column
sales<-movies$Sales
# Create two new lists of type and sales
type_list<-list()
sales_list<-list()
for(i in 1:nrow(movies)) {
for(j in 1:length(type[[i]])) {
# Take the every type of each movie and combine it into a new list
add_list<-list(type[[i]][j])
type_list<-c(type_list,add_list)
# Divide the box office of each movies by number of types
add_list<-list(sales[[i]]/length(type[[i]]))
sales_list<-c(sales_list,add_list)
}
}
# Build two new frames for type and sales
type_df<-data.frame(unlist(type_list))
sales_df<-data.frame(unlist(sales_list))
# Insert the ID column into type_df and sales_df
ID<-c(1:nrow(type_df))
type_df<-data.frame(ID,type_df)
sales_df<-data.frame(ID,sales_df)
# Merge two data frame by "ID" and delete invalid information
type_sales_df<-merge(type_df,sales_df,by="ID")
type_sales_df<-na.omit(type_sales_df)
# Add frequency in type data frame
type_df<-count(type_df$unlist.type_list.)
# Delete invalid information for the last items
type_df<-type_df[-nrow(type_df),]
# In descending order
type_df<-type_df[order(type_df$freq,decreasing=T),]
# Create a new data frame of sales in each type
sum_sales_list<-list()
for(i in 1:nrow(type_df)) {
sum_sales<-0
for(j in 1:nrow(type_sales_df)) {
if(type_sales_df$unlist.type_list.[j]==type_df$x[i]) {
sum_sales<-sum_sales+type_sales_df$unlist.sales_list.[j]
}
}
sum_sales_list[i]<-sum_sales
}
sum_sales_df<-data.frame(type_df$x,unlist(sum_sales_list))
# Draw the pie chart of the number of type
type_ePie<-ePie(type_df,size=c(650, 430))+eLegend(orient="vertical")+
eTitle(title="各类型电影上榜数量饼图")
type_ePie
# Draw the pie chart of sales in different types
type_sales_ePie<-ePie(sum_sales_df,size=c(650, 430))+eLegend(orient="vertical")+
eTitle(title="各类型电影票房饼图")
type_sales_ePie
# ---------- Draw the cloud of actors ----------
# Extract the actor column and divide by /
actor<-movies$Actor
actor<-as.character(actor)
actor<-strsplit(actor,"/")
# Take the every actor of each movie and combine it into a new list
actor_list<-list()
for(i in 1:nrow(movies)) {
add_list<-list(actor[[i]][1],actor[[i]][2],actor[[i]][3])
actor_list<-c(actor_list,add_list)
}
# Build a new frame for actors
actor_df<-data.frame(unlist(actor_list))
# Add frequency in actor data frame
actor_df<-count(actor_df$unlist.actor_list.)
# Delete invalid information for the head and tail
actor_df<-actor_df[-c(1:7),]
actor_df<-na.omit(actor_df)
# Extract the actor which appeared more than twice
actor_df<-actor_df[which(actor_df$freq >= 2),]
# In descending order
actor_df<-actor_df[order(actor_df$freq,decreasing=T),]
# Draw the cloud
wordcloud2(actor_df,size=0.3,minRotation=-pi/6,maxRotation=pi/6,rotateRatio=0.9)
Rating、评价人数Appraise属性,并将电影的导演Director、主演Actor、类型Type、制片国家/地区Area、发行类别Publish数字化,将票房Sales按照10亿以上、1亿-10亿、5千万-1亿、低于5千万分成四个评价Evaluate:Excellent、Good、Normal、Bad,并对相关属性作归一化处理,归一化的方法采用min-max标准化,最终建立数字化的电影数据表。相关代码见代码部分的digitization.R。# Import packages
library(ISLR)
library(e1071)
# Read the digital data
digital_movies<-read.csv("..\\data\\digital_movies.csv")
Rating、评价人数Appraise、导演Director、主演Actor、类型Type、制片国家/地区Area、发行类别Publish为特征向量,以评价Evaluate为结果变量。# Extract the data except column 1,3 and 10 as feature vector
x=digital_movies[,-c(1,3,10)]
# Extract the data from column 10 as outcome variable
y=digital_movies[,10]
polynomial,损失函数参数coef0为2。# Build model: the kernel is polynomial kernel
model=svm(x,y,kernel="polynomial",gamma=if(is.vector(x)) 1 else 1/ncol(x),coef0=2)
model的相关信息summary(model)
##
## Call:
## svm.default(x = x, y = y, kernel = "polynomial", gamma = if (is.vector(x)) 1 else 1/ncol(x),
## coef0 = 2)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 1
## degree: 3
## gamma: 0.1428571
## coef.0: 2
##
## Number of Support Vectors: 611
##
## ( 28 242 211 130 )
##
##
## Number of Classes: 4
##
## Levels:
## Bad Excellent Good Normal
# Test with train data
pred<-predict(model,x)
# Check accuracy
table(pred,y)
## y
## pred Bad Excellent Good Normal
## Bad 86 0 14 32
## Excellent 0 24 2 0
## Good 29 14 428 84
## Normal 28 0 34 97
# Calculate accuracy
table<-as.data.frame(table(pred,y))
count<-0
for(i in 1:nrow(table)) {
if(table$pred[i] == table$y[i]) {
count<-count+table$Freq[i]
}
}
accuracy<-count/nrow(x)
accuracy
## [1] 0.728211
# Visualize (classes by color, SV by crosses):
plot(cmdscale(dist(x)),
col=c("purple","red","orange","green")[as.integer(y)],
pch=c("。","+")[1:200 %in% model$index+1])
legend(-0.96,-0.35,c("Bad","Excellent","Good","Normal"),
col=c("purple","red","orange","green"),lty=1)
# Interactive table for showing data
DT::datatable(movies %>% head(100))
本分析报告结论如下: